{
 "nbformat": 4,
 "nbformat_minor": 0,
 "metadata": {
  "colab": {
   "private_outputs": true,
   "provenance": []
  },
  "kernelspec": {
   "name": "python3",
   "display_name": "Python 3"
  },
  "language_info": {
   "name": "python"
  }
 },
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "view-in-github"
   },
   "source": [
    "<a href=\"https://colab.research.google.com/github/mikexcohen/Statistics_book/blob/main/stats_ch07_dataQC.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
   ]
  },
  {
   "cell_type": "markdown",
   "source": [
    "# Modern statistics: Intuition, Math, Python, R\n",
    "## Mike X Cohen (sincxpress.com)\n",
    "### https://www.amazon.com/dp/B0CQRGWGLY\n",
    "#### Code for chapter 7\n",
    "\n",
    "---\n",
    "\n",
    "# About this code file:\n",
    "\n",
    "### This notebook will reproduce most of the figures in this chapter (some figures were made in Inkscape), and illustrate the statistical concepts explained in the text. The point of providing the code is not just for you to recreate the figures, but for you to modify, adapt, explore, and experiment with the code.\n",
    "\n",
    "### Solutions to all exercises are at the bottom of the notebook.\n",
    "\n",
    "#### This code was written in google-colab. The notebook may require some modifications if you use a different IDE."
   ],
   "metadata": {
    "id": "yeVh6hm2ezCO"
   }
  },
  {
   "cell_type": "code",
   "source": [
    "# import libraries and define global settings\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import seaborn as sns\n",
    "import scipy.stats as stats\n",
    "\n",
    "import matplotlib.pyplot as plt\n",
    "\n",
    "# define global figure properties used for publication\n",
    "import matplotlib_inline.backend_inline\n",
    "matplotlib_inline.backend_inline.set_matplotlib_formats('svg') # display figures in vector format\n",
    "plt.rcParams.update({'font.size':14,             # font size\n",
    "                     'savefig.dpi':300,          # output resolution\n",
    "                     'axes.titlelocation':'left',# title location\n",
    "                     'axes.spines.right':False,  # remove axis bounding box\n",
    "                     'axes.spines.top':False,    # remove axis bounding box\n",
    "                     })"
   ],
   "metadata": {
    "id": "Bcz2Oz9IAG2T"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [],
   "metadata": {
    "id": "U8o84xpGAGzv"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "markdown",
   "source": [
    "# Figure 7.2: What to look for in visual inspection of data"
   ],
   "metadata": {
    "id": "ju8oyG7Gc5tS"
   }
  },
  {
   "cell_type": "code",
   "source": [
    "_,axs = plt.subplots(2,2,figsize=(12,6))\n",
    "\n",
    "# panel A: unexpected range\n",
    "x = np.concatenate((np.random.randn(20),np.random.randn(80)*30),axis=0)\n",
    "axs[0,0].plot(x,'ks',markersize=10,markerfacecolor=(.7,.7,.7),alpha=.8)\n",
    "axs[0,0].set(xlabel='Data index',xticks=[],yticks=[],ylabel='Data value')\n",
    "axs[0,0].set_title(r'$\\bf{A}$)  Unexpected data range')\n",
    "\n",
    "# panel B: distribution shape\n",
    "x = np.concatenate((5+np.random.randn(150),np.exp(1+np.random.randn(150))),axis=0)\n",
    "axs[0,1].hist(x,bins='fd',edgecolor='k',facecolor=(.7,.7,.7))\n",
    "axs[0,1].set(xlabel='Data value',xticks=[],yticks=[],ylabel='Count')\n",
    "axs[0,1].set_title(r'$\\bf{B}$)  Nonstandard distribution')\n",
    "\n",
    "# panel C: mixed datasets\n",
    "x = np.concatenate((4+np.random.randn(150),np.random.randn(150)-4),axis=0)\n",
    "axs[1,0].hist(x,bins=50,edgecolor='k',facecolor=(.7,.7,.7))\n",
    "axs[1,0].set(xlabel='Data value',xticks=[],yticks=[],ylabel='Count')\n",
    "axs[1,0].set_title(r'$\\bf{C}$)  Mixed dataset')\n",
    "\n",
    "# panel D: outliers\n",
    "x = np.random.randn(150)\n",
    "x[60] = 10\n",
    "x[84] = 14\n",
    "axs[1,1].plot(x,'ks',markersize=10,markerfacecolor=(.7,.7,.7),alpha=.8)\n",
    "axs[1,1].set(xlabel='Data index',xticks=[],yticks=[],ylabel='Data value')\n",
    "axs[1,1].set_title(r'$\\bf{B}$)  Outliers')\n",
    "\n",
    "# export\n",
    "plt.tight_layout()\n",
    "plt.savefig('dataQC_qualInspection.png')\n",
    "plt.show()"
   ],
   "metadata": {
    "id": "g7Wul699c7BY"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [],
   "metadata": {
    "id": "r11QkM3bc6Ip"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "markdown",
   "source": [
    "# Figure 7.3: Example of dataset with outliers"
   ],
   "metadata": {
    "id": "wyWRMY4daiYx"
   }
  },
  {
   "cell_type": "code",
   "source": [
    "# Create normally distributed data\n",
    "N = 100\n",
    "data = np.random.randn(N)\n",
    "\n",
    "# and add two random outliers in random positions\n",
    "data[np.random.choice(np.arange(N),2)] = np.random.uniform(2,3,2)**2\n",
    "\n",
    "# and plot\n",
    "plt.figure(figsize=(8,4))\n",
    "plt.plot(data,'ks',markersize=10,markerfacecolor=(.7,.7,.7))\n",
    "plt.xlim([-2,N+1])\n",
    "plt.xlabel('Data index')\n",
    "plt.ylabel('Data value')\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.savefig('dataQC_example2outliers.png')\n",
    "plt.show()"
   ],
   "metadata": {
    "id": "ZeKOmDvYLbLk"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [],
   "metadata": {
    "id": "6d_wb0O9aiIs"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "markdown",
   "source": [
    "# Figure 7.5: Z-score method for identifying outliers"
   ],
   "metadata": {
    "id": "8XaFnokCgeOq"
   }
  },
  {
   "cell_type": "code",
   "source": [
    "# outlier threshold\n",
    "zThreshold = 3.29\n",
    "\n",
    "# create some raw data\n",
    "N = 135\n",
    "data = np.exp(np.random.randn(N)/2) + 5\n",
    "\n",
    "# zscore the data\n",
    "dataZ = (data-np.mean(data)) / np.std(data,ddof=1)\n",
    "\n",
    "# identify data indices containing outliers\n",
    "outliers = np.where(np.abs(dataZ)>zThreshold)[0]\n",
    "\n",
    "\n",
    "# and plot\n",
    "_,axs = plt.subplots(1,2,figsize=(10,4))\n",
    "axs[0].plot(data,'ks',markersize=10,markerfacecolor=(.7,.7,.7))\n",
    "axs[0].set(xlim=[-2,N+1],xlabel='Data index',ylabel='Data value')\n",
    "axs[0].set_title(r'$\\bf{A}$)  Original data')\n",
    "\n",
    "\n",
    "axs[1].plot(dataZ,'ks',markersize=10,markerfacecolor=(.9,.9,.9))\n",
    "axs[1].axhline(zThreshold,linestyle='--',color=(.9,.9,.9))\n",
    "axs[1].plot(outliers,dataZ[outliers],'kx',markersize=10,markeredgewidth=2)\n",
    "axs[1].set(xlim=[-3,N+2],xlabel='Data index',ylabel='Transformed data value')\n",
    "axs[1].set_title(r'$\\bf{B}$)  Z-transformed data')\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.savefig('dataQC_zMethodOutliers.png')\n",
    "plt.show()"
   ],
   "metadata": {
    "id": "FlAs32WngeL_"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [],
   "metadata": {
    "id": "nRU3YF6S3mJn"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "markdown",
   "source": [
    "# Figure 7.6: Impact of removing outliers on z-values"
   ],
   "metadata": {
    "id": "t6ppCM6ZCFo6"
   }
  },
  {
   "cell_type": "code",
   "source": [
    "# create some raw data\n",
    "N = 10 # sample size\n",
    "data = np.exp(np.random.randn(N)/2) + 5\n",
    "data[-1] = np.max(data)+2 # impose an outlier (at the end for convenience)\n",
    "xvals = np.arange(N)\n",
    "\n",
    "dataZ1 = (data-np.mean(data)) / np.std(data,ddof=1)\n",
    "dataZ2 = (data[:-1]-np.mean(data[:-1])) / np.std(data[:-1],ddof=1)\n",
    "\n",
    "_,axs = plt.subplots(1,2,figsize=(10,4))\n",
    "axs[0].plot(xvals,data,'ks',markersize=10,markerfacecolor=(.7,.7,.7))\n",
    "axs[0].set(xticks=[],xlabel='Data index',ylabel='Raw data value')\n",
    "axs[0].set_title(r'$\\bf{A}$)  Raw data')\n",
    "\n",
    "axs[1].plot(xvals,dataZ1,'ks',markersize=10,markerfacecolor=(.7,.7,.7),label='Z with outlier')\n",
    "axs[1].plot(xvals[:-1],dataZ2,'ko',markersize=10,markerfacecolor=(.5,.5,.5),label='Z without outlier')\n",
    "axs[1].set(xticks=[],xlabel='Data index',ylabel='Transformed data value')\n",
    "axs[1].legend()\n",
    "axs[1].set_title(r'$\\bf{B}$)  Z-transformed data')\n",
    "\n",
    "# draw lines connection pre/post-removal values\n",
    "for d,z,x in zip(dataZ1[:-1],dataZ2,xvals[:-1]):\n",
    "  axs[1].plot([x,x],[d,z],':',color=(.7,.7,.7),zorder=-10)\n",
    "\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.savefig('dataQC_recalculatingZ.png')\n",
    "plt.show()"
   ],
   "metadata": {
    "id": "v3RAWq6_CFuC"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [],
   "metadata": {
    "id": "rlwnukrBCFxP"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "markdown",
   "source": [
    "# Figure 7.8: Data trimming"
   ],
   "metadata": {
    "id": "jTraoliSwJ75"
   }
  },
  {
   "cell_type": "code",
   "source": [
    "N = 74\n",
    "data = np.random.randn(N)**3\n",
    "\n",
    "# find largest and smallest values\n",
    "k = 2\n",
    "sortidx = np.argsort(data)\n",
    "minvals = sortidx[:k]\n",
    "maxvals = sortidx[-k:]\n",
    "\n",
    "_,axs = plt.subplots(2,1,figsize=(8,6))\n",
    "axs[0].plot(data,'ks',markersize=10,markerfacecolor=(.9,.9,.9))\n",
    "axs[0].plot(minvals,data[minvals],'kx',markersize=10,markeredgewidth=2)\n",
    "axs[0].plot(maxvals,data[maxvals],'kx',markersize=10,markeredgewidth=2)\n",
    "axs[0].set_title(r'$\\bf{A}$)  Data with k-extreme points trimmed')\n",
    "\n",
    "\n",
    "# create a Gaussian probability curve for the panel B\n",
    "x = np.linspace(-4,4,401)\n",
    "gpdf = stats.norm.pdf(x)\n",
    "\n",
    "# the find the indices of the 2.5% and 97.5%\n",
    "lbndi = np.argmin(np.abs(x-stats.norm.ppf(.025))) # lbndi = Lower BouND Index\n",
    "ubndi = np.argmin(np.abs(x-stats.norm.ppf(1-.025)))\n",
    "\n",
    "\n",
    "# plot the probability function and the vertical lines\n",
    "axs[1].plot(x,gpdf,'k',linewidth=2)\n",
    "axs[1].axvline(x[lbndi],color=(.5,.5,.5),linewidth=.5,linestyle='--')\n",
    "axs[1].axvline(x[ubndi],color=(.5,.5,.5),linewidth=.5,linestyle='--')\n",
    "axs[1].set(xlim=x[[0,-1]],ylim=[0,.42])\n",
    "axs[1].set_title(r'$\\bf{B}$)  Histogram showing trimmed areas')\n",
    "\n",
    "# now create patches for the rejected area\n",
    "axs[1].fill_between(x[:lbndi+1],gpdf[:lbndi+1],color='k',alpha=.4)\n",
    "axs[1].fill_between(x[ubndi:],gpdf[ubndi:],color='k',alpha=.4)\n",
    "\n",
    "\n",
    "# and save\n",
    "plt.tight_layout()\n",
    "plt.savefig('dataQC_trimming.png')\n",
    "plt.show()"
   ],
   "metadata": {
    "id": "mTmgkLmP-zRM"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [],
   "metadata": {
    "id": "qYMkKK4OSnh0"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "markdown",
   "source": [
    "# Exercise 1"
   ],
   "metadata": {
    "id": "7cG8Q6YRrnxX"
   }
  },
  {
   "cell_type": "code",
   "source": [
    "## iterative method\n",
    "# Note about this code: Because of random numbers, you are not guaranteed to get a result\n",
    "# that highlights the method. Try running the code several times.\n",
    "\n",
    "N = 30\n",
    "data = np.random.randn(N)\n",
    "data[data<-1] = data[data<-1]+2\n",
    "data[data>1.5] = data[data>1.5]**2; # try to force a few outliers\n",
    "\n",
    "\n",
    "# pick a lenient threshold just for illustration\n",
    "zscorethresh = 2\n",
    "dataZ = (data-np.mean(data)) / np.std(data,ddof=1)\n",
    "\n",
    "plt.figure(figsize=(10,4))\n",
    "\n",
    "colorz = 'brkmc'\n",
    "numiters = 0 # iteration counter\n",
    "while True:\n",
    "\n",
    "  # convert to z\n",
    "  datamean = np.nanmean(dataZ)\n",
    "  datastd  = np.nanstd(dataZ,ddof=1)\n",
    "  dataZ = (dataZ-datamean) / datastd\n",
    "\n",
    "  # find data values to remove\n",
    "  toremove = dataZ>zscorethresh\n",
    "\n",
    "  # break out of while loop if no points to remove\n",
    "  if sum(toremove)==0:\n",
    "    break\n",
    "  else:\n",
    "    # otherwise, mark the outliers in the plot\n",
    "    plt.plot(np.where(toremove)[0]+numiters/5,dataZ[toremove],'%sx'%colorz[numiters],\n",
    "             markersize=12,markeredgewidth=3)\n",
    "    dataZ[toremove] = np.nan\n",
    "\n",
    "  # replot\n",
    "  plt.plot(np.arange(N)+numiters/5,dataZ,linestyle='None',marker=f'${numiters}$',markersize=12,\n",
    "           color=colorz[numiters])\n",
    "\n",
    "  # update counter\n",
    "  numiters = numiters + 1\n",
    "\n",
    "\n",
    "plt.ylabel('Z-score')\n",
    "plt.xlabel('Data index')\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.savefig('dataQC_iterativeZmethod.png')\n",
    "plt.show()"
   ],
   "metadata": {
    "id": "60DsJNq-wJ2D"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [],
   "metadata": {
    "id": "I96aT_Flq1tZ"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "markdown",
   "source": [
    "# Exercise 2"
   ],
   "metadata": {
    "id": "jSaZWSyHSne-"
   }
  },
  {
   "cell_type": "code",
   "source": [
    "# create data\n",
    "N = 10000\n",
    "Y = np.exp(np.sin(np.random.randn(N)))\n",
    "\n",
    "# make a copy of the data to manipulate\n",
    "Yc = Y.copy()\n",
    "\n",
    "# not specified in the instructions, but always a good idea to inspect the data!\n",
    "plt.hist(Y,bins=40);"
   ],
   "metadata": {
    "id": "asRLmU10SncG"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [
    "# percent to remove (two-tailed)\n",
    "k = 4\n",
    "\n",
    "# convert that to a number of data points to remove from each tail\n",
    "pnts2nan = int( (k/2)/100 * N ) # with stated parameters, this should be 200\n",
    "\n",
    "# find the data sorting\n",
    "sort_idx = np.argsort(Y)\n",
    "\n",
    "# nan the two tails separately\n",
    "Yc[sort_idx[:pnts2nan]]  = np.nan\n",
    "Yc[sort_idx[-pnts2nan:]] = np.nan\n",
    "\n",
    "# confirm the right numbers of points\n",
    "print(f'Total dataset size: {len(Yc)}')\n",
    "print(f'Valid dataset size: {np.sum(~np.isnan(Yc))}')"
   ],
   "metadata": {
    "id": "NiD8HdB3Vc1w"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [
    "# compute the mean and median (also used in the next exercise)\n",
    "meanY = np.mean(Y)\n",
    "medianY = np.median(Y)\n",
    "\n",
    "# print the means\n",
    "print(f'Mean of original: {meanY:.3f}')\n",
    "print(f'Mean of trimmed:  {np.nanmean(Yc):.3f}')\n",
    "\n",
    "# print the medians\n",
    "print(' ')\n",
    "print(f'Median of original: {medianY:.3f}')\n",
    "print(f'Median of trimmed:  {np.nanmedian(Yc):.3f}')"
   ],
   "metadata": {
    "id": "fvEE3pb4SnZP"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [],
   "metadata": {
    "id": "ZSyh85bdSnV4"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "markdown",
   "source": [
    "# Exercise 3"
   ],
   "metadata": {
    "id": "litynwuXYhMy"
   }
  },
  {
   "cell_type": "code",
   "source": [
    "# the range of k values\n",
    "ks = np.arange(1,50,3)\n",
    "\n",
    "# initialize a results matrix for mean/median\n",
    "results = np.zeros((len(ks),2))\n",
    "\n",
    "\n",
    "# the experiment!\n",
    "for idx,ki in enumerate(ks):\n",
    "\n",
    "  # make a new copy of the original data\n",
    "  Yc = Y.copy() # Note: Y was defined in Exercise 2\n",
    "\n",
    "  # convert that to a number of data points to remove from each tail\n",
    "  pnts2nan = int( (ki/2)/100 * N )\n",
    "\n",
    "  # nan the two tails separately\n",
    "  Yc[sort_idx[:pnts2nan]]  = np.nan\n",
    "  Yc[sort_idx[-pnts2nan:]] = np.nan\n",
    "\n",
    "  # collect mean and median\n",
    "  results[idx,0] = 100*(np.nanmean(Yc)-meanY) / meanY\n",
    "  results[idx,1] = 100*(np.nanmedian(Yc)-medianY) / medianY\n",
    "\n",
    "  print(f'Total/valid dataset size: {len(Yc)} -> {np.sum(~np.isnan(Yc))}')"
   ],
   "metadata": {
    "id": "1ZWxrctmYkGU"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [
    "# plot the results\n",
    "\n",
    "plt.figure(figsize=(8,4))\n",
    "plt.plot(ks,results[:,0],'s-',color=[.6,.6,.6],markerfacecolor=[.8,.8,.8],markersize=10,label='Mean')\n",
    "plt.plot(ks,results[:,1],'o-',color='k',markerfacecolor=[.4,.4,.4],markersize=10,label='Median')\n",
    "plt.legend()\n",
    "plt.xlabel('k% to trim')\n",
    "plt.ylabel(r'Descriptive value (%$\\Delta$)')\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.savefig('dataQC_ex3.png')\n",
    "plt.show()"
   ],
   "metadata": {
    "id": "xNxC34d0YkI1"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [],
   "metadata": {
    "id": "JVvMFrZzwKDt"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "markdown",
   "source": [
    "# Exercise 4"
   ],
   "metadata": {
    "id": "pTepB-T085Nv"
   }
  },
  {
   "cell_type": "code",
   "source": [
    "# create data\n",
    "N = 1000\n",
    "X = np.random.f(5,100,size=N)\n",
    "\n",
    "# zscore data\n",
    "Xz = (X-np.mean(X)) / np.std(X,ddof=1)\n",
    "zThresh = 3\n",
    "\n",
    "# clean data\n",
    "Xclean = X[Xz<zThresh]\n",
    "\n",
    "# report number of removed data points\n",
    "print(f'Original sample size: {N}')\n",
    "print(f'Cleaned sample size:  {len(Xclean)}')\n",
    "print(f'Percent data removed: {100*(1-len(Xclean)/N):.2f}%')"
   ],
   "metadata": {
    "id": "xwiPUHOB85KW"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [
    "# histogram bins using FD rule\n",
    "y_fd_all = np.histogram(X,bins='fd')\n",
    "y_fd_clean = np.histogram(Xclean,bins='fd')\n",
    "\n",
    "# histogram bins using set boundaries\n",
    "# Note that I create the bin boundaries using k+1 numbers, then input that vector of boundaries into np.histogram\n",
    "edges = np.linspace(np.min(X),np.max(X),41)\n",
    "y_40_all = np.histogram(X,bins=edges)\n",
    "y_40_clean = np.histogram(Xclean,bins=edges)\n",
    "\n",
    "\n",
    "# plotting the histograms\n",
    "_,axs = plt.subplots(2,1,figsize=(8,7))\n",
    "axs[0].plot((y_fd_all[1][:-1]+y_fd_all[1][1:])/2,y_fd_all[0],'ks-',\n",
    "         label='Pre-cleaned',markersize=11,markerfacecolor=(.6,.6,.6))\n",
    "axs[0].plot((y_fd_clean[1][:-1]+y_fd_clean[1][1:])/2,y_fd_clean[0],'o--',color=(.6,.6,.6),\n",
    "         label='Cleaned',markersize=10,markerfacecolor=(.9,.9,.9))\n",
    "axs[0].set_title(r'$\\bf{A}$)  Histograms using F-D rule')\n",
    "\n",
    "axs[1].plot((y_40_all[1][:-1]+y_40_all[1][1:])/2,y_40_all[0],'ks-',\n",
    "         label='Pre-cleaned',markersize=11,markerfacecolor=(.6,.6,.6))\n",
    "axs[1].plot((y_40_clean[1][:-1]+y_40_clean[1][1:])/2,y_40_clean[0],'o--',color=(.6,.6,.6),\n",
    "         label='Cleaned',markersize=10,markerfacecolor=(.9,.9,.9))\n",
    "axs[1].set_title(r'$\\bf{B}$)  Histograms using 40 bins')\n",
    "\n",
    "\n",
    "# axis adjustments\n",
    "for a in axs:\n",
    "  a.legend()\n",
    "  a.set(xlabel='F value',ylabel='Count',\n",
    "        xlim=[np.min(X)-.02,np.max(X)+.02],xticks=range(6))\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.savefig('dataQC_ex4.png')\n",
    "plt.show()"
   ],
   "metadata": {
    "id": "5mCUUqjW85Gu"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [],
   "metadata": {
    "id": "EFX_7Phg85Au"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "markdown",
   "source": [
    "# Exercise 5"
   ],
   "metadata": {
    "id": "cfLYXGp3wKGb"
   }
  },
  {
   "cell_type": "code",
   "source": [
    "# url reference: https://archive.ics.uci.edu/ml/datasets/Arrhythmia\n",
    "\n",
    "# import data\n",
    "df = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/arrhythmia/arrhythmia.data',\n",
    "                 usecols = np.arange(9),\n",
    "                 names   = ['age','sex','height','weight','qrs','p-r','q-t','t','p'])\n",
    "\n",
    "# inspect\n",
    "df.head()"
   ],
   "metadata": {
    "id": "N4XfDsPSAy00"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [
    "# boxplots of raw data\n",
    "plt.figure(figsize=(10,5))\n",
    "sns.boxplot(data=df).set(xlabel='Data feature',ylabel='Data value')\n",
    "plt.show()"
   ],
   "metadata": {
    "id": "ZbYZFnhBAy3l"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [
    "# make a copy of the original data matrix\n",
    "df_z = df.copy()\n",
    "\n",
    "for col in df_z.columns:\n",
    "  if not (col=='sex'):\n",
    "    df_z[col] = (df[col] - df[col].mean()) / df[col].std(ddof=1)\n",
    "\n",
    "# inspect again\n",
    "df_z"
   ],
   "metadata": {
    "id": "gsQ1CXmFAy50"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [
    "# box plots of z-scored data\n",
    "plt.figure(figsize=(10,5))\n",
    "sns.boxplot(data=df_z).set(xlabel='Data feature',ylabel='Data value')\n",
    "plt.show()"
   ],
   "metadata": {
    "id": "Uzbg2KU1Ay80"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [
    "# Note: this cell combines the previous graphs to make one figure for the book\n",
    "_,axs = plt.subplots(2,1,figsize=(10,7))\n",
    "sns.boxplot(data=df,  ax=axs[0]).set(xticks=[],ylabel='Data value')\n",
    "axs[0].set_title(r'$\\bf{A}$)  Raw data')\n",
    "sns.boxplot(data=df_z,ax=axs[1]).set(xlabel='Data feature',ylabel='Transformed data value')\n",
    "axs[1].set_title(r'$\\bf{B}$)  Z-transformed data')\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.savefig('dataQC_ex5b.png')\n",
    "plt.show()"
   ],
   "metadata": {
    "id": "eD_0DG0BwKJT"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [],
   "metadata": {
    "id": "CFS8Vu7nxmtd"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "markdown",
   "source": [
    "# Exercise 6"
   ],
   "metadata": {
    "id": "dQplY229xmc0"
   }
  },
  {
   "cell_type": "code",
   "source": [
    "# remove based on z-score threshold\n",
    "zThresh = 3.29 # p<.001\n",
    "\n",
    "df_clean = df.copy()\n",
    "df_clean[df_z>zThresh]  = np.nan # positive tail\n",
    "df_clean[df_z<-zThresh] = np.nan # negative tail"
   ],
   "metadata": {
    "id": "5C0lYd5_wKL8"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [
    "# plot\n",
    "_,axs = plt.subplots(2,1,figsize=(10,7))\n",
    "sns.boxplot(data=df,ax=axs[0]).set(xticks=[],ylabel='Data value')\n",
    "axs[0].set_title(r'$\\bf{A}$)  Raw data')\n",
    "sns.boxplot(data=df_clean,ax=axs[1]).set(xlabel='Data feature',ylabel='Data value')\n",
    "axs[1].set_title(r'$\\bf{B}$)  Cleaned data')\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.savefig('dataQC_ex6a.png')\n",
    "plt.show()"
   ],
   "metadata": {
    "id": "ZbE6LluswKOw"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [
    "# print the means\n",
    "raw_means = df.mean().values\n",
    "cleaned_means = df_clean.mean().values\n",
    "\n",
    "for name,pre,post in zip(df.columns,raw_means,cleaned_means):\n",
    "  print(f'{name:>6}: {pre:6.2f}  ->  {post:6.2f}')"
   ],
   "metadata": {
    "id": "unnB6HdcHj66"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [
    "# compute percent change\n",
    "pctchange = 100*(cleaned_means-raw_means) / raw_means\n",
    "\n",
    "# and plot\n",
    "plt.figure(figsize=(9,4))\n",
    "plt.plot(pctchange,'ks',markersize=14,markerfacecolor=(.7,.7,.7))\n",
    "plt.axhline(0,color='k',linewidth=2,zorder=-1)\n",
    "plt.xticks(range(9),labels=df.columns)\n",
    "plt.ylabel('Percent')\n",
    "plt.title('Change in feature means after z-score data rejection',loc='center')\n",
    "plt.grid()\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.savefig('dataQC_ex6b.png')\n",
    "plt.show()"
   ],
   "metadata": {
    "id": "8H7P0Ml5J5GA"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [],
   "metadata": {
    "id": "JG_BZFbKJ49Z"
   },
   "execution_count": null,
   "outputs": []
  }
 ]
}